#include <x86intrin.h>
#include <stddef.h>

/*
 * dgemm_avx - accumulate a square matrix product into C:  C += A * B.
 *
 * All three matrices are n x n doubles addressed as M[row * n + col].
 *
 * n: matrix dimension; any value is accepted, including 0 and values
 *    that are not a multiple of 4.
 * A: left operand, read only.
 * B: right operand, read only.
 * C: accumulator; read on entry and overwritten with C + A * B.
 *
 * Four adjacent columns of one row of C are processed per AVX vector.
 * Unaligned loads/stores are used so the buffers need no particular
 * alignment, and columns left over when n is not a multiple of 4 are
 * handled by a scalar loop instead of touching memory past the row end.
 */
#if defined(__GNUC__) && !defined(__AVX__)
/* Allow the AVX intrinsics even if the file is built without -mavx. */
__attribute__((target("avx")))
#endif
void dgemm_avx(size_t n, double *A, double *B, double *C)
{
	/* Largest multiple of 4 not exceeding n: the vectorisable columns. */
	const size_t n_vec = n - (n % 4);

	for (size_t i = 0; i < n; ++i) {
		const double *a_row = A + i * n;
		double *c_row = C + i * n;
		size_t j = 0;

		/* Vector part: columns j .. j+3 of row i in one register. */
		for (; j < n_vec; j += 4) {
			__m256d acc = _mm256_loadu_pd(c_row + j);

			for (size_t k = 0; k < n; ++k) {
				/* A[i][k] replicated into all four lanes. */
				__m256d a = _mm256_broadcast_sd(a_row + k);
				__m256d b = _mm256_loadu_pd(B + k * n + j);

				acc = _mm256_add_pd(acc, _mm256_mul_pd(a, b));
			}
			_mm256_storeu_pd(c_row + j, acc);
		}

		/* Scalar tail: the remaining n % 4 columns, same k order. */
		for (; j < n; ++j) {
			double acc = c_row[j];

			for (size_t k = 0; k < n; ++k) {
				acc += a_row[k] * B[k * n + j];
			}
			c_row[j] = acc;
		}
	}
}
